In [1]:
import pandas as pd
import datetime as dt
import os
import openmeteo_requests
import requests_cache
In [3]:
file_path = os.path.join(r"c:\\Users\\youss\\Documents\\University of Calgary\\DATA 601\\Group Project\\UofC601L02_Project\\data", 'Total_Data_10Y_Top23.csv')
us_avaiation_data = pd.read_csv(file_path)
us_avaiation_data['FL_DATE'] = pd.to_datetime(us_avaiation_data['FL_DATE'])
us_avaiation_data['TOTAL_DELAY'] = us_avaiation_data['WEATHER_DELAY'] + us_avaiation_data['LATE_AIRCRAFT_DELAY'] + us_avaiation_data['CARRIER_DELAY'] + us_avaiation_data['NAS_DELAY']
In [6]:
us_avaiation_data.head(-5)
Out[6]:
| FL_DATE | OP_UNIQUE_CARRIER | TAIL_NUM | ORIGIN_AIRPORT_SEQ_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | ORIGIN_CITY_NAME | DEST_AIRPORT_SEQ_ID | DEST_CITY_MARKET_ID | DEST | ... | ARR_TIME | ARR_DELAY | CANCELLED | CANCELLATION_CODE | CARRIER_DELAY | WEATHER_DELAY | NAS_DELAY | SECURITY_DELAY | LATE_AIRCRAFT_DELAY | TOTAL_DELAY | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | AA | N002AA | 1105703 | 31057 | CLT | Charlotte, NC | 1129803 | 30194 | DFW | ... | 1214.0 | 9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | 2014-07-01 | AA | N002AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1105703 | 31057 | CLT | ... | 945.0 | 0.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | 2014-07-01 | AA | N004AA | 1039705 | 30397 | ATL | Atlanta, GA | 1129803 | 30194 | DFW | ... | 1341.0 | -9.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 1159.0 | 4.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 2014-07-01 | AA | N004AA | 1129803 | 30194 | DFW | Dallas/Fort Worth, TX | 1039705 | 30397 | ATL | ... | 2317.0 | 2.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 12926546 | 2018-12-31 | YX | N748YX | 1161802 | 31703 | EWR | Newark, NJ | 1129806 | 30194 | DFW | ... | 1934.0 | 1.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12926547 | 2018-12-31 | YX | N858RW | 1161802 | 31703 | EWR | Newark, NJ | 1449202 | 34492 | RDU | ... | 1243.0 | -10.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12926548 | 2018-12-31 | YX | N858RW | 1449202 | 34492 | RDU | Raleigh/Durham, NC | 1393007 | 30977 | ORD | ... | 1450.0 | -5.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12926549 | 2018-12-31 | YX | N863RW | 1129806 | 30194 | DFW | Dallas/Fort Worth, TX | 1161802 | 31703 | EWR | ... | 1758.0 | -13.0 | 0.0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 12926550 | 2018-12-31 | YX | N863RW | 1393007 | 30977 | ORD | Chicago, IL | 1129806 | 30194 | DFW | ... | 1325.0 | 25.0 | 0.0 | NaN | 0.0 | 0.0 | 25.0 | 0.0 | 0.0 | 25.0 |
12926551 rows × 25 columns
DELAYS¶
In [147]:
weather_delay = us_avaiation_data[us_avaiation_data['CANCELLATION_CODE'].isna()]
# Extracting Month and Year to Join with Monthly Weather Conditions
weather_delay['FL_MONTH'] = weather_delay['FL_DATE'].dt.month
weather_delay['FL_YEAR'] = weather_delay['FL_DATE'].dt.year
# if delay is NaN we want to show this as 0
weather_delay['TOTAL_DELAY'] = weather_delay['TOTAL_DELAY'].fillna(0.0)
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\234183179.py:2: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy weather_delay['FL_MONTH'] = weather_delay['FL_DATE'].dt.month C:\Users\youss\AppData\Local\Temp\ipykernel_24372\234183179.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy weather_delay['FL_YEAR'] = weather_delay['FL_DATE'].dt.year C:\Users\youss\AppData\Local\Temp\ipykernel_24372\234183179.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy weather_delay['TOTAL_DELAY'] = weather_delay['TOTAL_DELAY'].fillna(0.0)
In [148]:
# rearragning columns to have Y value at the end
weather_delay = weather_delay[['FL_YEAR', 'FL_MONTH', 'FL_DATE', 'ORIGIN', 'DEST', 'DEST_CITY_NAME',
'DEP_TIME', 'DEP_DELAY', 'ARR_TIME', 'ARR_DELAY',
'CANCELLED', 'CANCELLATION_CODE', 'CARRIER_DELAY', 'WEATHER_DELAY',
'NAS_DELAY', 'SECURITY_DELAY', 'LATE_AIRCRAFT_DELAY', 'TOTAL_DELAY']]
#if not arr time or dep time meaning the flight never happened
weather_delay = weather_delay.dropna(subset=['ARR_TIME'])
weather_delay = weather_delay.dropna(subset=['DEP_TIME'])
#checking sums of na arr and dep times
weather_delay['ARR_TIME'].isna().sum()
weather_delay['DEP_TIME'].isna().sum()
Out[148]:
np.int64(0)
In [152]:
def rounding_time(x):
"""
Rounding Dep and Arr Time to the nearest Hour +/-
Arguments:
x -- time string variable
Returns:
int: rounded arr/ dep time to the nearest hour +/-
"""
time_str = f"{int(x):04d}"
if len(time_str) != 4:
return None
hour = int(time_str[:2])
minute = int(time_str[2:])
if 0 <= hour < 24 and 0 <= minute < 60:
if minute >= 30:
hour += 1
if hour == 24:
hour = 0
minute = 0
else:
minute = 0
return f"{hour:02d}{minute:02d}"
else:
return None
weather_delay['ARR_TIME_STR_ROUNDED'] = weather_delay['ARR_TIME'].apply(rounding_time)
weather_delay['DEP_TIME_STR_ROUNDED'] = weather_delay['DEP_TIME'].apply(rounding_time)
# drop invalid formats
weather_delay = weather_delay.dropna(subset=['ARR_TIME_STR_ROUNDED'])
weather_delay = weather_delay.dropna(subset=['DEP_TIME_STR_ROUNDED'])
weather_delay['ARR_TIME_DT'] = pd.to_datetime(weather_delay['ARR_TIME_STR_ROUNDED'], format='%H%M', errors='coerce').dt.time
weather_delay['DEP_TIME_DT'] = pd.to_datetime(weather_delay['DEP_TIME_STR_ROUNDED'], format='%H%M', errors='coerce').dt.time
In [154]:
# formatting time to deseriable format
weather_delay['ARR_TIME_DT'] = pd.to_datetime(weather_delay['ARR_TIME_DT'], format='%H:%M:%S').dt.time
weather_delay['DEP_TIME_DT'] = pd.to_datetime(weather_delay['DEP_TIME_DT'], format='%H:%M:%S').dt.time
weather_delay['ARR_HOUR'] = pd.to_datetime(weather_delay['ARR_TIME_DT'], format='%H:%M:%S').dt.hour
weather_delay['ARR_HOUR'] = weather_delay['ARR_HOUR'].apply((lambda x: f'{x:02d}:00:00'))
weather_delay['DEP_HOUR'] = pd.to_datetime(weather_delay['DEP_TIME_DT'], format='%H:%M:%S').dt.hour
weather_delay['DEP_HOUR'] = weather_delay['DEP_HOUR'].apply((lambda x: f'{x:02d}:00:00'))
In [156]:
# Grouping Flights by Date , Hour and Dest/Origin
arr_delay_sum = weather_delay.groupby(by = ['FL_DATE','ARR_HOUR','DEST'])['TOTAL_DELAY'].sum().reset_index()
dep_delay_sum = weather_delay.groupby(by =['FL_DATE','DEP_HOUR','ORIGIN'])['TOTAL_DELAY'].sum().reset_index()
# renaming columns to prepare to merge with weather dataset
arr_delay_sum = arr_delay_sum.rename(columns={'TOTAL_DELAY': "total_delay_arr"})
dep_delay_sum = dep_delay_sum.rename(columns={'TOTAL_DELAY': "total_delay_dep"})
weather_delay = pd.merge(arr_delay_sum, dep_delay_sum, left_on=['FL_DATE','ARR_HOUR', 'DEST'] , right_on=['FL_DATE','DEP_HOUR','ORIGIN'], how='inner')
weather_delay['TOTAL_DELAY'] = weather_delay['total_delay_arr'].fillna(0) + weather_delay['total_delay_dep'].fillna(0)
# Dont need origin anymore as the same airport has all departing and arriving flights
weather_delay.drop(columns=['ORIGIN'], inplace=True)
weather_delay.rename(columns={'DEST': 'Airport'}, inplace=True)
In [159]:
# Checking results
weather_delay
Out[159]:
| FL_DATE | ARR_HOUR | Airport | total_delay_arr | DEP_HOUR | total_delay_dep | TOTAL_DELAY | |
|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | 00:00:00 | DEN | 28.0 | 00:00:00 | 0.0 | 28.0 |
| 1 | 2014-07-01 | 00:00:00 | HOU | 55.0 | 00:00:00 | 95.0 | 150.0 |
| 2 | 2014-07-01 | 00:00:00 | JFK | 682.0 | 00:00:00 | 0.0 | 682.0 |
| 3 | 2014-07-01 | 00:00:00 | LAS | 141.0 | 00:00:00 | 98.0 | 239.0 |
| 4 | 2014-07-01 | 00:00:00 | LAX | 87.0 | 00:00:00 | 104.0 | 191.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 1527927 | 2024-06-30 | 23:00:00 | SFO | 340.0 | 23:00:00 | 132.0 | 472.0 |
| 1527928 | 2024-06-30 | 23:00:00 | SJC | 75.0 | 23:00:00 | 0.0 | 75.0 |
| 1527929 | 2024-06-30 | 23:00:00 | SJU | 73.0 | 23:00:00 | 0.0 | 73.0 |
| 1527930 | 2024-06-30 | 23:00:00 | SMF | 124.0 | 23:00:00 | 0.0 | 124.0 |
| 1527931 | 2024-06-30 | 23:00:00 | STL | 154.0 | 23:00:00 | 0.0 | 154.0 |
1527932 rows × 7 columns
In [6]:
# Reading dataset that obtains airport Lat and Long to use in weather api
airport_lat_long_path = os.path.join(r"c:\\Users\\youss\\Documents\\University of Calgary\\DATA 601\\Group Project\\UofC601L02_Project\supplementary", "dataset_airport_geo_codes.csv")
airport_geo_codes = pd.read_csv(airport_lat_long_path)
In [7]:
# Getting the Geocodes of the airports we are analyzing
unique_airports = us_avaiation_data['ORIGIN'].unique()
unique_airports_mask = airport_geo_codes['AIRPORT'].isin(unique_airports)
needed_airport_codes = airport_geo_codes[unique_airports_mask]
needed_airport_codes['DISPLAY_AIRPORT_NAME'] = needed_airport_codes['DISPLAY_AIRPORT_NAME'].str.strip()
needed_airport_codes['DISPLAY_AIRPORT_CITY_NAME_FULL'] = needed_airport_codes['DISPLAY_AIRPORT_CITY_NAME_FULL'].str.strip()
needed_airport_codes['DISPLAY_CITY_MARKET_NAME_FULL'] = needed_airport_codes['DISPLAY_CITY_MARKET_NAME_FULL'].str.strip()
needed_airport_codes['LATITUDE'] = needed_airport_codes['LATITUDE'].round(2)
needed_airport_codes['LONGITUDE'] = needed_airport_codes['LONGITUDE'].round(2)
needed_airport_codes_cleaned = needed_airport_codes.drop_duplicates(subset=['DISPLAY_AIRPORT_NAME', 'DISPLAY_AIRPORT_CITY_NAME_FULL',
'DISPLAY_CITY_MARKET_NAME_FULL', 'LATITUDE', 'LONGITUDE'])
In [8]:
# intially saved the file after transformation, indicated by commented code
save_path = os.path.join(r"c:\\Users\\youss\\Documents\\University of Calgary\\DATA 601\\Group Project\\UofC601L02_Project\supplementary", 'dataset_airport_geo_codes.csv')
# needed_airport_codes_cleaned.to_csv(save_path)
dataset_airport_geo_codes = pd.read_csv(save_path)
geo_code_tuple_list = [(x,y) for x , y in zip(dataset_airport_geo_codes['LATITUDE'], dataset_airport_geo_codes['LONGITUDE'])]
dataset_airport_codes = [ x for x in dataset_airport_geo_codes['AIRPORT']]
In [16]:
# this skeletion code is given to us by https://open-meteo.com/ thank god!
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)
# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
"latitude": [33.64, 35.21, 32.85, 39.86, 32.9, 40.69, 29.64, 40.64, 36.08, 33.94, 28.43, 25.79, 29.99, 41.97, 45.59, 33.43, 35.88, 47.45, 37.62, 37.36, 18.44, 38.7, 33.68, 38.75],
"longitude": [-84.43, -80.95, -96.85, -104.67, -97.04, -74.17, -95.28, -73.78, -115.15, -118.41, -81.31, -80.29, -90.25, -87.91, -122.6, -112.01, -78.78, -122.31, -122.38, -121.94, -66, -121.59, -117.87, -90.37],
"hourly": ["rain", "snowfall", "weather_code", "visibility", "wind_speed_10m", "wind_speed_120m", "wind_direction_10m", "wind_direction_120m", "wind_gusts_10m"],
"start_date": "2014-07-01",
"end_date": "2024-06-30",
}
responses = openmeteo.weather_api(url, params=params)
dataset_airport_codes = [ x for x in dataset_airport_geo_codes['AIRPORT']]
airport_weather_dataframe_list = []
for row , airport_code in zip(range(len(responses)), dataset_airport_codes):
# Process first location. Add a for-loop for multiple locations or weather models
response = responses[row]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")
# Process daily data. The order of variables needs to be the same as requested.
hourly = response.Hourly()
hourly_rain = hourly.Variables(0).ValuesAsNumpy()
hourly_snowfall = hourly.Variables(1).ValuesAsNumpy()
hourly_weather_code = hourly.Variables(2).ValuesAsNumpy()
hourly_visibility = hourly.Variables(3).ValuesAsNumpy()
hourly_wind_speed_10m = hourly.Variables(4).ValuesAsNumpy()
hourly_wind_speed_120m = hourly.Variables(5).ValuesAsNumpy()
hourly_wind_direction_10m = hourly.Variables(6).ValuesAsNumpy()
hourly_wind_direction_120m = hourly.Variables(7).ValuesAsNumpy()
hourly_wind_gusts_10m = hourly.Variables(8).ValuesAsNumpy()
hourly_data = {"date": pd.date_range(
start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
freq = pd.Timedelta(seconds = hourly.Interval()),
inclusive = "left"
)}
hourly_data['airport_code'] = airport_code
hourly_data["rain"] = hourly_rain
hourly_data["snowfall"] = hourly_snowfall
hourly_data["weather_code"] = hourly_weather_code
hourly_data["visibility"] = hourly_visibility
hourly_data["wind_speed_10m"] = hourly_wind_speed_10m
hourly_data["wind_speed_120m"] = hourly_wind_speed_120m
hourly_data["wind_direction_10m"] = hourly_wind_direction_10m
hourly_data["wind_direction_120m"] = hourly_wind_direction_120m
hourly_data["wind_gusts_10m"] = hourly_wind_gusts_10m
hourly_dataframe = pd.DataFrame(data = hourly_data)
airport_weather_dataframe_list.append(hourly_dataframe)
Coordinates 33.63795852661133°N -84.4168701171875°E Elevation 305.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 35.184532165527344°N -80.93112182617188°E Elevation 213.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 32.86467361450195°N -96.82986450195312°E Elevation 146.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 39.89455032348633°N -104.68618774414062°E Elevation 1633.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 32.93497085571289°N -97.05880737304688°E Elevation 178.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 40.6678352355957°N -74.19265747070312°E Elevation 2.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 29.630929946899414°N -95.31866455078125°E Elevation 12.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 40.6678352355957°N -73.81021118164062°E Elevation 3.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 36.09841537475586°N -115.09727478027344°E Elevation 641.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 33.91915512084961°N -118.39152526855469°E Elevation 36.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 28.43585205078125°N -81.30682373046875°E Elevation 33.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 25.764497756958008°N -80.29409790039062°E Elevation 2.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 29.982423782348633°N -90.20977783203125°E Elevation -1.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 42.00351333618164°N -87.9039306640625°E Elevation 198.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 45.588748931884766°N -122.54718017578125°E Elevation 5.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 33.42706298828125°N -112.02719116210938°E Elevation 340.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 35.88751983642578°N -78.83718872070312°E Elevation 129.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 47.41651916503906°N -122.31147766113281°E Elevation 120.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 37.644989013671875°N -122.44325256347656°E Elevation 1.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 37.36379623413086°N -121.91233825683594°E Elevation 16.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 18.453426361083984°N -65.95889282226562°E Elevation 4.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 38.69947052001953°N -121.63487243652344°E Elevation 6.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 33.70825958251953°N -117.8385009765625°E Elevation 15.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 38.769771575927734°N -90.36834716796875°E Elevation 161.0 m asl Timezone None None Timezone difference to GMT+0 0 s
In [17]:
#concating dataframes
airport_hist_weather_data = pd.concat(airport_weather_dataframe_list, axis = 0)
In [18]:
airport_hist_weather_data
Out[18]:
| date | airport_code | rain | snowfall | weather_code | visibility | wind_speed_10m | wind_speed_120m | wind_direction_10m | wind_direction_120m | wind_gusts_10m | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 00:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | NaN | 3.758510 | NaN | 196.699326 | NaN | 11.879999 |
| 1 | 2014-07-01 01:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | NaN | 4.334974 | NaN | 175.236450 | NaN | 6.120000 |
| 2 | 2014-07-01 02:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | NaN | 4.829907 | NaN | 206.564987 | NaN | 7.559999 |
| 3 | 2014-07-01 03:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | NaN | 5.241679 | NaN | 195.945465 | NaN | 8.640000 |
| 4 | 2014-07-01 04:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | NaN | 6.120000 | NaN | 180.000000 | NaN | 10.080000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 87667 | 2024-06-30 19:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | NaN | 13.004921 | NaN | 4.763556 | NaN | 29.519999 |
| 87668 | 2024-06-30 20:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | NaN | 14.058450 | NaN | 2.935620 | NaN | 30.960001 |
| 87669 | 2024-06-30 21:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | NaN | 14.869351 | NaN | 6.952864 | NaN | 31.680000 |
| 87670 | 2024-06-30 22:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | NaN | 15.745627 | NaN | 10.539137 | NaN | 33.119999 |
| 87671 | 2024-06-30 23:00:00+00:00 | STL | 0.0 | 0.0 | 1.0 | NaN | 15.281989 | NaN | 15.018415 | NaN | 32.399998 |
2104128 rows × 11 columns
In [19]:
# adding the WMO weather descriptions
wmo_weather_code_path = os.path.join(r"c:\\Users\\youss\\Documents\\University of Calgary\\DATA 601\\Group Project\\UofC601L02_Project\supplementary",'wmo_weather_codes.csv')
weather_codes = pd.read_csv(wmo_weather_code_path)
weather_codes['Code'] = weather_codes['Code'].astype(float)
weather_codes = weather_codes.rename(columns={'Description':'Weather Code Description', 'Code':'weather_code'})
airport_hist_weather_data = pd.merge(airport_hist_weather_data, weather_codes, how="left", on='weather_code')
airport_hist_weather_data = airport_hist_weather_data[['date', 'airport_code', 'rain', 'snowfall', 'weather_code', 'Weather Code Description',
'visibility', 'wind_speed_10m', 'wind_speed_120m', 'wind_direction_10m',
'wind_direction_120m', 'wind_gusts_10m']]
In [168]:
airport_hist_weather_data
Out[168]:
| date | airport_code | rain | snowfall | weather_code | Weather Code Description | visibility | wind_speed_10m | wind_speed_120m | wind_direction_10m | wind_direction_120m | wind_gusts_10m | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 00:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | NaN | 3.758510 | NaN | 196.699326 | NaN | 11.879999 |
| 1 | 2014-07-01 01:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | NaN | 4.334974 | NaN | 175.236450 | NaN | 6.120000 |
| 2 | 2014-07-01 02:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | NaN | 4.829907 | NaN | 206.564987 | NaN | 7.559999 |
| 3 | 2014-07-01 03:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | NaN | 5.241679 | NaN | 195.945465 | NaN | 8.640000 |
| 4 | 2014-07-01 04:00:00+00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | NaN | 6.120000 | NaN | 180.000000 | NaN | 10.080000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2104123 | 2024-06-30 19:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | NaN | 13.004921 | NaN | 4.763556 | NaN | 29.519999 |
| 2104124 | 2024-06-30 20:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | NaN | 14.058450 | NaN | 2.935620 | NaN | 30.960001 |
| 2104125 | 2024-06-30 21:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | NaN | 14.869351 | NaN | 6.952864 | NaN | 31.680000 |
| 2104126 | 2024-06-30 22:00:00+00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | NaN | 15.745627 | NaN | 10.539137 | NaN | 33.119999 |
| 2104127 | 2024-06-30 23:00:00+00:00 | STL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | NaN | 15.281989 | NaN | 15.018415 | NaN | 32.399998 |
2104128 rows × 12 columns
In [169]:
# chaning date format in from the weather api data
airport_hist_weather_data['Hour'] = airport_hist_weather_data['date'].dt.hour
airport_hist_weather_data['date'] = airport_hist_weather_data['date'].dt.date
airport_hist_weather_data['Hour'] = airport_hist_weather_data['Hour'].apply((lambda x: f'{x:02d}:00:00'))
airport_hist_weather_data = airport_hist_weather_data[['date','Hour','airport_code', 'rain', 'snowfall', 'weather_code', 'Weather Code Description',
'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m']]
airport_hist_weather_data['date'] = pd.to_datetime(airport_hist_weather_data['date'])
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2402388856.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy airport_hist_weather_data['date'] = pd.to_datetime(airport_hist_weather_data['date'])
In [179]:
# making sure column names are the same for the merge
weather_delay_renamed = weather_delay.rename(columns={'FL_DATE':'date','ARR_HOUR':'Hour','Airport':'airport_code'})
weather_delay_renamed.drop(columns=['DEP_HOUR'], inplace=True)
airport_hist_weather_delay = pd.merge(left=airport_hist_weather_data, right=weather_delay_renamed , how='left', on= ['date', 'Hour', 'airport_code'])
In [296]:
airport_hist_weather_delay
Out[296]:
| date | Hour | airport_code | rain | snowfall | weather_code | Weather Code Description | wind_speed_10m | wind_direction_10m | wind_gusts_10m | total_delay_arr | total_delay_dep | TOTAL_DELAY | Month | Year | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | 00:00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | 3.758510 | 196.699326 | 11.879999 | NaN | NaN | NaN | 7 | 2014 |
| 1 | 2014-07-01 | 01:00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | 4.334974 | 175.236450 | 6.120000 | NaN | NaN | NaN | 7 | 2014 |
| 2 | 2014-07-01 | 02:00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | 4.829907 | 206.564987 | 7.559999 | NaN | NaN | NaN | 7 | 2014 |
| 3 | 2014-07-01 | 03:00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | 5.241679 | 195.945465 | 8.640000 | NaN | NaN | NaN | 7 | 2014 |
| 4 | 2014-07-01 | 04:00:00 | ATL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | 6.120000 | 180.000000 | 10.080000 | NaN | NaN | NaN | 7 | 2014 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 2104123 | 2024-06-30 | 19:00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | 13.004921 | 4.763556 | 29.519999 | 214.0 | 302.0 | 516.0 | 6 | 2024 |
| 2104124 | 2024-06-30 | 20:00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | 14.058450 | 2.935620 | 30.960001 | 52.0 | 169.0 | 221.0 | 6 | 2024 |
| 2104125 | 2024-06-30 | 21:00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | 14.869351 | 6.952864 | 31.680000 | 0.0 | 65.0 | 65.0 | 6 | 2024 |
| 2104126 | 2024-06-30 | 22:00:00 | STL | 0.0 | 0.0 | 0.0 | No significant weather observed | 15.745627 | 10.539137 | 33.119999 | NaN | NaN | NaN | 6 | 2024 |
| 2104127 | 2024-06-30 | 23:00:00 | STL | 0.0 | 0.0 | 1.0 | Cloud development not observed or not observable | 15.281989 | 15.018415 | 32.399998 | 154.0 | 0.0 | 154.0 | 6 | 2024 |
2104128 rows × 15 columns
In [181]:
weather_delay_by_airport_cat = airport_hist_weather_delay.groupby(by=['Weather Code Description', 'airport_code'])['TOTAL_DELAY'].describe()
weather_delay_by_airport_cat
Out[181]:
| count | mean | std | min | 25% | 50% | 75% | max | ||
|---|---|---|---|---|---|---|---|---|---|
| Weather Code Description | airport_code | ||||||||
| Cloud development not observed or not observable | ATL | 16320.0 | 268.157782 | 392.900138 | 0.0 | 42.0 | 136.0 | 332.00 | 5183.0 |
| CLT | 15644.0 | 177.315329 | 268.029009 | 0.0 | 21.0 | 85.0 | 218.00 | 3881.0 | |
| DAL | 11360.0 | 95.837412 | 171.208240 | 0.0 | 0.0 | 38.0 | 115.00 | 2973.0 | |
| DEN | 16043.0 | 307.404351 | 407.220041 | 0.0 | 60.0 | 182.0 | 402.00 | 6493.0 | |
| DFW | 14234.0 | 288.506253 | 405.197407 | 0.0 | 43.0 | 156.0 | 372.00 | 6037.0 | |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| State of the sky generally unchanged | SJC | 4231.0 | 71.722288 | 114.904475 | 0.0 | 0.0 | 30.0 | 94.00 | 1928.0 |
| SJU | 4250.0 | 76.628941 | 138.030321 | 0.0 | 0.0 | 26.0 | 93.00 | 1611.0 | |
| SMF | 3256.0 | 85.958538 | 133.558363 | 0.0 | 0.0 | 40.0 | 114.25 | 1764.0 | |
| SNA | 4592.0 | 96.941855 | 130.200139 | 0.0 | 0.0 | 54.0 | 131.00 | 1820.0 | |
| STL | 4801.0 | 92.754218 | 146.962134 | 0.0 | 0.0 | 39.0 | 120.00 | 1748.0 |
294 rows × 8 columns
In [300]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [302]:
# Correlation
cols_for_corr = ['rain', 'snowfall', 'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m','total_delay_arr', 'total_delay_dep','TOTAL_DELAY']
airport_weather_corr = airport_hist_weather_delay[cols_for_corr].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(airport_weather_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix: Weather Conditions and Total Delay')
plt.show()
In [299]:
airport_weather_corr
Out[299]:
| rain | snowfall | wind_speed_10m | wind_direction_10m | wind_gusts_10m | total_delay_arr | total_delay_dep | TOTAL_DELAY | |
|---|---|---|---|---|---|---|---|---|
| rain | 1.000000 | 0.005507 | 0.093496 | -0.046700 | 0.150269 | 0.061258 | 0.076129 | 0.080802 |
| snowfall | 0.005507 | 1.000000 | 0.045724 | -0.014191 | 0.042395 | 0.036119 | 0.069299 | 0.062097 |
| wind_speed_10m | 0.093496 | 0.045724 | 1.000000 | 0.055178 | 0.901299 | 0.102762 | 0.070656 | 0.101767 |
| wind_direction_10m | -0.046700 | -0.014191 | 0.055178 | 1.000000 | 0.068393 | 0.019749 | 0.001248 | 0.012256 |
| wind_gusts_10m | 0.150269 | 0.042395 | 0.901299 | 0.068393 | 1.000000 | 0.140419 | 0.104217 | 0.143600 |
| total_delay_arr | 0.061258 | 0.036119 | 0.102762 | 0.019749 | 0.140419 | 1.000000 | 0.447831 | 0.848365 |
| total_delay_dep | 0.076129 | 0.069299 | 0.070656 | 0.001248 | 0.104217 | 0.447831 | 1.000000 | 0.853281 |
| TOTAL_DELAY | 0.080802 | 0.062097 | 0.101767 | 0.012256 | 0.143600 | 0.848365 | 0.853281 | 1.000000 |
Investigating Relationship Between Weather Conditions and Delay¶
In [186]:
# trying to visualize the relationship
import matplotlib.pyplot as plt
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes[0, 0].scatter(airport_hist_weather_delay['rain'], airport_hist_weather_delay['TOTAL_DELAY'], color='blue', alpha=0.5)
axes[0, 0].set_title('Rain vs Total Delay')
axes[0, 0].set_xlabel('Rain (inches)')
axes[0, 0].set_ylabel('Total Delay (minutes)')
axes[0, 1].scatter(airport_hist_weather_delay['snowfall'], airport_hist_weather_delay['TOTAL_DELAY'], color='red', alpha=0.5)
axes[0, 1].set_title('Snowfall vs Total Delay')
axes[0, 1].set_xlabel('Snowfall (inches)')
axes[0, 1].set_ylabel('Total Delay (minutes)')
axes[0, 2].scatter(airport_hist_weather_delay['wind_speed_10m'], airport_hist_weather_delay['TOTAL_DELAY'], color='green', alpha=0.5)
axes[0, 2].set_title('Wind Speed vs Total Delay')
axes[0, 2].set_xlabel('Wind Speed (m/s)')
axes[0, 2].set_ylabel('Total Delay (minutes)')
axes[1, 0].scatter(airport_hist_weather_delay['wind_direction_10m'], airport_hist_weather_delay['TOTAL_DELAY'], color='purple', alpha=0.5)
axes[1, 0].set_title('Wind Direction vs Total Delay')
axes[1, 0].set_xlabel('Wind Direction (degrees)')
axes[1, 0].set_ylabel('Total Delay (minutes)')
axes[1, 1].scatter(airport_hist_weather_delay['wind_gusts_10m'], airport_hist_weather_delay['TOTAL_DELAY'], color='orange', alpha=0.5)
axes[1, 1].set_title('Wind Gusts vs Total Delay')
axes[1, 1].set_xlabel('Wind Gusts (m/s)')
axes[1, 1].set_ylabel('Total Delay (minutes)')
fig.delaxes(axes[1, 2])
plt.tight_layout()
plt.show()
In [187]:
# Ranking Delay by Year and Month to try to get some insights
airport_hist_weather_delay['Month'] = airport_hist_weather_delay['date'].dt.month
airport_hist_weather_delay['Year'] = airport_hist_weather_delay['date'].dt.year
year_month_delay = airport_hist_weather_delay.groupby(by =['Year', 'Month', 'airport_code'])['TOTAL_DELAY'].mean().reset_index()
year_month_delay['Rank'] = year_month_delay.groupby(['Year', 'Month'])['TOTAL_DELAY'].rank(ascending = False)
sorted_year_month_delay = year_month_delay.sort_values(by= ['Year', 'Month', 'Rank'])
In [188]:
sorted_year_month_delay
Out[188]:
| Year | Month | airport_code | TOTAL_DELAY | Rank | |
|---|---|---|---|---|---|
| 9 | 2014 | 7 | LAX | 459.552013 | 1.0 |
| 18 | 2014 | 7 | SFO | 412.316225 | 2.0 |
| 4 | 2014 | 7 | DFW | 331.001692 | 3.0 |
| 3 | 2014 | 7 | DEN | 329.702381 | 4.0 |
| 7 | 2014 | 7 | JFK | 327.257426 | 5.0 |
| ... | ... | ... | ... | ... | ... |
| 2878 | 2024 | 6 | SNA | 143.708716 | 20.0 |
| 2868 | 2024 | 6 | MSY | 143.339114 | 21.0 |
| 2877 | 2024 | 6 | SMF | 134.943431 | 22.0 |
| 2862 | 2024 | 6 | HOU | 129.772455 | 23.0 |
| 2875 | 2024 | 6 | SJC | 105.686598 | 24.0 |
2880 rows × 5 columns
Checking The Differences in Correlation plots for each Airports¶
In [189]:
import pandas as pd
# creating a dictionary of correlation dataframes for each airport
weather_columns = ['rain', 'snowfall', 'wind_speed_10m', 'wind_direction_10m', 'wind_gusts_10m', 'TOTAL_DELAY']
airport_groups = airport_hist_weather_delay.groupby('airport_code')
airport_correlations = {}
for airport, group in airport_groups:
airport_data = group[weather_columns]
correlation_matrix = airport_data.corr()
airport_correlations[airport] = correlation_matrix
In [304]:
# plotting them all
for airport, corr_matrix in airport_correlations.items():
plt.figure(figsize=(10, 5))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('{} Correlation Matrix: Weather Conditions and Total Delay'.format(airport))
plt.show()
Weather Condtions Severity Trends and Delay¶
In [191]:
# Plotting the trends of weather conditions by bins and the delay
custom_general_bins = [0,5,10,15,20,25,30,35,40,60,80,100,120,140,180,220,260,300]
snowfall_bins = pd.cut(airport_hist_weather_delay['snowfall'], bins=custom_general_bins)
rainfall_bins = pd.cut(airport_hist_weather_delay['rain'], bins=custom_general_bins)
windspeed_bins = pd.cut(airport_hist_weather_delay['wind_speed_10m'], bins=custom_general_bins)
windgusts_bins = pd.cut(airport_hist_weather_delay['wind_gusts_10m'], bins=custom_general_bins)
snowfall_delay = airport_hist_weather_delay.groupby(snowfall_bins)['TOTAL_DELAY'].mean()
rainfall_delay = airport_hist_weather_delay.groupby(rainfall_bins)['TOTAL_DELAY'].mean()
windspeed_delay = airport_hist_weather_delay.groupby(windspeed_bins)['TOTAL_DELAY'].mean()
windgust_delay = airport_hist_weather_delay.groupby(windgusts_bins)['TOTAL_DELAY'].mean()
plt.figure(figsize=(10, 6))
plt.plot(snowfall_delay.index.astype(str), snowfall_delay, color='red', label='Snowfall', marker='x')
plt.plot(rainfall_delay.index.astype(str), rainfall_delay, color='green', label='Rainfall', marker='x')
plt.plot(windspeed_delay.index.astype(str), windspeed_delay, color='blue', label='WindSpeed', marker='x')
plt.plot(windgust_delay.index.astype(str), windgust_delay, color='pink', label='WindGusts', marker='x')
plt.title('Weather Delay Trends')
plt.xlabel('Weather Condition (Group)')
plt.ylabel('Average Total Delay (minutes)')
plt.xticks(rotation=45)
plt.legend()
# Display the plot
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\411984819.py:8: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. snowfall_delay = airport_hist_weather_delay.groupby(snowfall_bins)['TOTAL_DELAY'].mean() C:\Users\youss\AppData\Local\Temp\ipykernel_24372\411984819.py:9: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. rainfall_delay = airport_hist_weather_delay.groupby(rainfall_bins)['TOTAL_DELAY'].mean() C:\Users\youss\AppData\Local\Temp\ipykernel_24372\411984819.py:10: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. windspeed_delay = airport_hist_weather_delay.groupby(windspeed_bins)['TOTAL_DELAY'].mean() C:\Users\youss\AppData\Local\Temp\ipykernel_24372\411984819.py:11: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. windgust_delay = airport_hist_weather_delay.groupby(windgusts_bins)['TOTAL_DELAY'].mean()
Graphing the each Weather Conditon Trend and the Delay at Airports¶
In [284]:
custom_bins_wind = [0, 20, 40, 60, 80, 100, 120, 140]
wind_bins = pd.cut(airport_hist_weather_delay['wind_speed_10m'], bins=custom_bins_wind)
grouped_data = airport_hist_weather_delay.groupby(['airport_code', wind_bins])['TOTAL_DELAY'].mean().unstack()
grouped_data.columns = grouped_data.columns.astype(str)
plt.figure(figsize=(12, 6))
for airport in grouped_data.index:
plt.plot(grouped_data.columns.astype(str), grouped_data.loc[airport], marker='o', label=airport)
last_valid_index = grouped_data.loc[airport].last_valid_index()
if last_valid_index is not None:
plt.text(last_valid_index, grouped_data.loc[airport].loc[last_valid_index], airport,
fontsize=8, color=plt.gca().lines[-1].get_color(),
verticalalignment='center', horizontalalignment='left')
plt.title('Total Delay Trends Based on Wind Speed for Each Airport')
plt.xlabel('Wind Speed (Group - km/h)')
plt.ylabel('Average Delay (minutes)')
plt.xticks(rotation=45)
plt.subplots_adjust(right=0.85)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\114460246.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data = airport_hist_weather_delay.groupby(['airport_code', wind_bins])['TOTAL_DELAY'].mean().unstack()
In [281]:
bins = 5
snowfall_bins = pd.cut(airport_hist_weather_delay['snowfall'], bins=bins)
grouped_data = airport_hist_weather_delay.groupby(['airport_code', snowfall_bins])['TOTAL_DELAY'].mean().unstack()
grouped_data.columns = grouped_data.columns.astype(str)
plt.figure(figsize=(12, 6))
for airport in grouped_data.index:
plt.plot(grouped_data.columns.astype(str), grouped_data.loc[airport], marker='o', label=airport)
last_valid_index = grouped_data.loc[airport].last_valid_index()
if last_valid_index is not None:
plt.text(last_valid_index, grouped_data.loc[airport].loc[last_valid_index], airport,
fontsize=8, color=plt.gca().lines[-1].get_color(),
verticalalignment='center', horizontalalignment='left')
plt.title('TOTAL Delay Trends Based on Snowfall for Each Airport')
plt.xlabel('Snowfall (Group - Inches)')
plt.ylabel('Average Delay (minutes)')
plt.xticks(rotation=45)
plt.subplots_adjust(right=0.85)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\755983878.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data = airport_hist_weather_delay.groupby(['airport_code', snowfall_bins])['TOTAL_DELAY'].mean().unstack()
In [283]:
custom_bins_gust = [0, 30, 60, 90, 120, 150, 180, 210,240,270,300]
wind_gust_bins = pd.cut(airport_hist_weather_delay['wind_gusts_10m'], bins=custom_bins_gust)
grouped_data = airport_hist_weather_delay.groupby(['airport_code', wind_gust_bins])['TOTAL_DELAY'].mean().unstack()
grouped_data.columns = grouped_data.columns.astype(str)
plt.figure(figsize=(12, 6))
for airport in grouped_data.index:
plt.plot(grouped_data.columns.astype(str), grouped_data.loc[airport], marker='o', label=airport)
last_valid_index = grouped_data.loc[airport].last_valid_index()
if last_valid_index is not None:
plt.text(last_valid_index, grouped_data.loc[airport].loc[last_valid_index], airport,
fontsize=8, color=plt.gca().lines[-1].get_color(),
verticalalignment='center', horizontalalignment='left')
plt.title('Total Delay Trends Based on Wind Gusts for Each Airport')
plt.xlabel('Wind Gusts (Group - km/h)')
plt.ylabel('Average Delay (minutes)')
plt.xticks(rotation=45)
plt.subplots_adjust(right=0.85)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\3482490949.py:5: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data = airport_hist_weather_delay.groupby(['airport_code', wind_gust_bins])['TOTAL_DELAY'].mean().unstack()
Investigating The Airpots that have consistent trends¶
In [195]:
airports = ['JFK', 'EWR', 'ORD', 'PDX', 'SEA']
filtered_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'].isin(airports)]
weather_factors = ['snowfall']
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
plt.figure(figsize=(8, 6))
plt.scatter(airport_data[factor], airport_data['TOTAL_DELAY'], alpha=0.5)
plt.title(f'{factor.capitalize()} vs Weather Delay at {airport}')
plt.xlabel(f'{factor.capitalize()}')
plt.ylabel('Weather Delay (minutes)')
plt.grid(True)
plt.tight_layout()
plt.show()
In [196]:
airports = ['JFK', 'EWR', 'ORD']
filtered_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'].isin(airports)]
weather_factors = ['snowfall']
bins = 10
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
binned_data = pd.cut(airport_data[factor], bins=bins)
avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
plt.figure(figsize=(8, 6))
plt.plot(avg_delay.index.astype(str), avg_delay, marker='o')
plt.title(f'{factor.capitalize()} vs Weather Delay at {airport}')
plt.xlabel(f'{factor.capitalize()} (Binned)')
plt.ylabel('Average Weather Delay (minutes)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\1799442002.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\1799442002.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\1799442002.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
In [288]:
airports = ['LAX', 'EWR', 'JFK', 'SJU']
filtered_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'].isin(airports)]
weather_factors = ['wind_speed_10m']
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
plt.figure(figsize=(8, 6))
plt.scatter(airport_data[factor], airport_data['TOTAL_DELAY'], alpha=0.5)
plt.title(f'{factor.capitalize()} vs Weather Delay at {airport}')
plt.xlabel(f'{factor.capitalize()}')
plt.ylabel('Weather Delay (minutes)')
plt.grid(True)
plt.tight_layout()
plt.show()
In [43]:
airports = ['LAS', 'EWR', 'JFK', 'SJU']
filtered_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'].isin(airports)]
weather_factors = ['wind_speed_10m']
bins = 10
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
binned_data = pd.cut(airport_data[factor], bins=bins)
avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
# Plot the line plot
plt.figure(figsize=(8, 6))
plt.plot(avg_delay.index.astype(str), avg_delay, marker='o')
plt.title(f'{factor.capitalize()} vs Weather Delay at {airport}')
plt.xlabel(f'{factor.capitalize()} (Binned)')
plt.ylabel('Average Weather Delay (minutes)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2231535221.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2231535221.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2231535221.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2231535221.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
In [44]:
airports = ['EWR', 'MCO', 'LAS']
filtered_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'].isin(airports)]
weather_factors = ['wind_gusts_10m']
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
plt.figure(figsize=(8, 6))
plt.scatter(airport_data[factor], airport_data['TOTAL_DELAY'], alpha=0.5)
plt.title(f'{factor.capitalize()} vs Weather Delay at {airport}')
plt.xlabel(f'{factor.capitalize()}')
plt.ylabel('Weather Delay (minutes)')
plt.grid(True)
plt.tight_layout()
plt.show()
In [45]:
airports = ['EWR', 'LAS', 'MCO']
filtered_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'].isin(airports)]
weather_factors = ['wind_gusts_10m']
bins = 10
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
binned_data = pd.cut(airport_data[factor], bins=bins)
avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
plt.figure(figsize=(8, 6))
plt.plot(avg_delay.index.astype(str), avg_delay, marker='o')
plt.title(f'{factor.capitalize()} vs Weather Delay at {airport}')
plt.xlabel(f'{factor.capitalize()} (Binned)')
plt.ylabel('Average Weather Delay (minutes)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\3391558497.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\3391558497.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\3391558497.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['TOTAL_DELAY'].mean()
Wanted to see which airpot gets the most snowfall and when , was not that helpful , Oh well¶
In [198]:
airports = ['JFK', 'EWR', 'ORD', 'PDX','SEA']
results = {}
for airport in airports:
airport_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'] == airport]
monthly_averages = airport_data.groupby('Month').agg({
'snowfall': 'mean'
}).reset_index()
monthly_averages['Snowfall_Rank'] = monthly_averages['snowfall'].rank(ascending=False)
results[airport] = monthly_averages
for airport, ranking in results.items():
print(f"Ranking for {airport}:")
print(ranking)
print("\n")
Ranking for JFK:
Month snowfall Snowfall_Rank
0 1 0.020022 3.0
1 2 0.026497 1.0
2 3 0.023315 2.0
3 4 0.001429 6.0
4 5 0.000019 7.0
5 6 0.000000 10.5
6 7 0.000000 10.5
7 8 0.000000 10.5
8 9 0.000000 10.5
9 10 0.000009 8.0
10 11 0.002751 5.0
11 12 0.008505 4.0
Ranking for EWR:
Month snowfall Snowfall_Rank
0 1 0.019504 3.0
1 2 0.027961 1.0
2 3 0.024387 2.0
3 4 0.001769 6.0
4 5 0.000047 7.0
5 6 0.000000 10.0
6 7 0.000000 10.0
7 8 0.000000 10.0
8 9 0.000000 10.0
9 10 0.000000 10.0
10 11 0.004074 5.0
11 12 0.009239 4.0
Ranking for ORD:
Month snowfall Snowfall_Rank
0 1 0.031265 1.0
1 2 0.026157 2.0
2 3 0.012241 4.0
3 4 0.009625 6.0
4 5 0.000056 8.0
5 6 0.000000 10.5
6 7 0.000000 10.5
7 8 0.000000 10.5
8 9 0.000000 10.5
9 10 0.002230 7.0
10 11 0.010928 5.0
11 12 0.012466 3.0
Ranking for PDX:
Month snowfall Snowfall_Rank
0 1 0.007489 4.0
1 2 0.019211 1.0
2 3 0.008261 3.0
3 4 0.005386 5.0
4 5 0.000000 9.5
5 6 0.000000 9.5
6 7 0.000000 9.5
7 8 0.000000 9.5
8 9 0.000000 9.5
9 10 0.000000 9.5
10 11 0.000826 6.0
11 12 0.009032 2.0
Ranking for SEA:
Month snowfall Snowfall_Rank
0 1 0.007668 3.0
1 2 0.019345 1.0
2 3 0.004356 4.0
3 4 0.003451 5.0
4 5 0.000000 10.0
5 6 0.000000 10.0
6 7 0.000000 10.0
7 8 0.000000 10.0
8 9 0.000000 10.0
9 10 0.000075 7.0
10 11 0.002557 6.0
11 12 0.012005 2.0
In [199]:
airports = ['JFK', 'EWR', 'ORD', 'SEA', 'PDX']
results = {}
for airport in airports:
airport_data = airport_hist_weather_delay[airport_hist_weather_delay['airport_code'] == airport]
monthly_averages = airport_data.groupby('Month').agg({
'snowfall': 'max'
}).reset_index()
monthly_averages['Snowfall_Rank'] = monthly_averages['snowfall'].rank(ascending=False)
results[airport] = monthly_averages
for airport, ranking in results.items():
print(f"Ranking for {airport}:")
print(ranking)
print("\n")
Ranking for JFK:
Month snowfall Snowfall_Rank
0 1 3.08 3.0
1 2 3.22 2.0
2 3 3.50 1.0
3 4 1.96 6.0
4 5 0.07 7.5
5 6 0.00 10.5
6 7 0.00 10.5
7 8 0.00 10.5
8 9 0.00 10.5
9 10 0.07 7.5
10 11 2.24 5.0
11 12 2.73 4.0
Ranking for EWR:
Month snowfall Snowfall_Rank
0 1 2.45 3.5
1 2 3.36 2.0
2 3 4.41 1.0
3 4 2.17 5.0
4 5 0.14 7.0
5 6 0.00 10.0
6 7 0.00 10.0
7 8 0.00 10.0
8 9 0.00 10.0
9 10 0.00 10.0
10 11 2.45 3.5
11 12 1.96 6.0
Ranking for ORD:
Month snowfall Snowfall_Rank
0 1 2.80 3.0
1 2 3.29 1.0
2 3 1.89 4.5
3 4 1.75 6.0
4 5 0.21 8.0
5 6 0.00 10.5
6 7 0.00 10.5
7 8 0.00 10.5
8 9 0.00 10.5
9 10 1.19 7.0
10 11 2.87 2.0
11 12 1.89 4.5
Ranking for SEA:
Month snowfall Snowfall_Rank
0 1 4.13 1.0
1 2 2.38 2.0
2 3 1.12 6.0
3 4 2.17 3.0
4 5 0.00 10.0
5 6 0.00 10.0
6 7 0.00 10.0
7 8 0.00 10.0
8 9 0.00 10.0
9 10 0.42 7.0
10 11 1.75 5.0
11 12 2.03 4.0
Ranking for PDX:
Month snowfall Snowfall_Rank
0 1 3.22 3.0
1 2 3.57 1.0
2 3 3.29 2.0
3 4 2.03 4.0
4 5 0.00 9.5
5 6 0.00 9.5
6 7 0.00 9.5
7 8 0.00 9.5
8 9 0.00 9.5
9 10 0.00 9.5
10 11 0.63 6.0
11 12 1.68 5.0
In [20]:
import openmeteo_requests
import requests_cache
import pandas as pd
from retry_requests import retry
# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)
# Make sure all required weather variables are listed here
# The order of variables in hourly or daily is important to assign them correctly below
url = "https://archive-api.open-meteo.com/v1/archive"
params = {
"latitude": [33.64, 35.21, 32.85, 39.86, 32.9, 40.69, 29.64, 40.64, 36.08, 33.94, 28.43, 25.79, 29.99, 41.97, 45.59, 33.43, 35.88, 47.45, 37.62, 37.36, 18.44, 38.7, 33.68, 38.75],
"longitude": [-84.43, -80.95, -96.85, -104.67, -97.04, -74.17, -95.28, -73.78, -115.15, -118.41, -81.31, -80.29, -90.25, -87.91, -122.6, -112.01, -78.78, -122.31, -122.38, -121.94, -66, -121.59, -117.87, -90.37],
"daily": ["weather_code","rain_sum", "snowfall_sum", "wind_speed_10m_max", "wind_gusts_10m_max"],
"start_date": "2014-07-01",
"end_date": "2024-06-30",
}
responses = openmeteo.weather_api(url, params=params)
dataset_airport_codes = [ x for x in dataset_airport_geo_codes['AIRPORT']]
airport_weather_dataframe_list = []
for row , airport_code in zip(range(len(responses)), dataset_airport_codes):
# Process first location. Add a for-loop for multiple locations or weather models
response = responses[row]
print(f"Coordinates {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation {response.Elevation()} m asl")
print(f"Timezone {response.Timezone()} {response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0 {response.UtcOffsetSeconds()} s")
# Process daily data. The order of variables needs to be the same as requested.
daily = response.Daily()
daily_weather_code = daily.Variables(0).ValuesAsNumpy()
daily_rain_sum = daily.Variables(1).ValuesAsNumpy()
daily_snowfall_sum = daily.Variables(2).ValuesAsNumpy()
daily_wind_speed_10m_max = daily.Variables(3).ValuesAsNumpy()
daily_wind_gusts_10m_max = daily.Variables(4).ValuesAsNumpy()
daily_data = {"date": pd.date_range(
start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
freq = pd.Timedelta(seconds = daily.Interval()),
inclusive = "left"
)}
daily_data['airport_code'] = airport_code
daily_data["weather_code"] = daily_weather_code
daily_data["rain_sum"] = daily_rain_sum
daily_data["snowfall_sum"] = daily_snowfall_sum
daily_data["wind_speed_10m_max"] = daily_wind_speed_10m_max
daily_data["wind_gusts_10m_max"] = daily_wind_gusts_10m_max
daily_dataframe = pd.DataFrame(data = daily_data)
airport_weather_dataframe_list.append(daily_dataframe)
Coordinates 33.63795852661133°N -84.4168701171875°E Elevation 305.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 35.184532165527344°N -80.93112182617188°E Elevation 213.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 32.86467361450195°N -96.82986450195312°E Elevation 146.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 39.89455032348633°N -104.68618774414062°E Elevation 1633.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 32.93497085571289°N -97.05880737304688°E Elevation 178.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 40.6678352355957°N -74.19265747070312°E Elevation 2.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 29.630929946899414°N -95.31866455078125°E Elevation 12.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 40.6678352355957°N -73.81021118164062°E Elevation 3.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 36.09841537475586°N -115.09727478027344°E Elevation 641.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 33.91915512084961°N -118.39152526855469°E Elevation 36.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 28.43585205078125°N -81.30682373046875°E Elevation 33.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 25.764497756958008°N -80.29409790039062°E Elevation 2.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 29.982423782348633°N -90.20977783203125°E Elevation -1.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 42.00351333618164°N -87.9039306640625°E Elevation 198.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 45.588748931884766°N -122.54718017578125°E Elevation 5.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 33.42706298828125°N -112.02719116210938°E Elevation 340.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 35.88751983642578°N -78.83718872070312°E Elevation 129.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 47.41651916503906°N -122.31147766113281°E Elevation 120.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 37.644989013671875°N -122.44325256347656°E Elevation 1.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 37.36379623413086°N -121.91233825683594°E Elevation 16.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 18.453426361083984°N -65.95889282226562°E Elevation 4.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 38.69947052001953°N -121.63487243652344°E Elevation 6.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 33.70825958251953°N -117.8385009765625°E Elevation 15.0 m asl Timezone None None Timezone difference to GMT+0 0 s Coordinates 38.769771575927734°N -90.36834716796875°E Elevation 161.0 m asl Timezone None None Timezone difference to GMT+0 0 s
In [21]:
# Getting the weather code as this will tell me what was so bad that day
airport_hist_weather_data_daily = pd.concat(airport_weather_dataframe_list, axis=0)
airport_hist_weather_data_daily = pd.merge(airport_hist_weather_data_daily, weather_codes, how="left", on='weather_code')
airport_hist_weather_data_daily['date'] = airport_hist_weather_data_daily['date'].dt.tz_localize(None)
In [22]:
airport_hist_weather_data_daily.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 87672 entries, 0 to 87671 Data columns (total 8 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 date 87672 non-null datetime64[ns] 1 airport_code 87672 non-null object 2 weather_code 87672 non-null float32 3 rain_sum 87672 non-null float32 4 snowfall_sum 87672 non-null float32 5 wind_speed_10m_max 87672 non-null float32 6 wind_gusts_10m_max 87672 non-null float32 7 Weather Code Description 87672 non-null object dtypes: datetime64[ns](1), float32(5), object(2) memory usage: 3.7+ MB
In [23]:
# I only want these rows
us_avaiation_data_cancelled = us_avaiation_data[['FL_DATE','ORIGIN', 'DEST','CANCELLED']]
weather_cancelled = us_avaiation_data_cancelled
Grouping By DEST and ORIGIN and aggregating to get the CANCELLATION RATIO = CANCELED FLIGHTS / TOTAL FLIGHTS¶
In [24]:
arr_cancel_sum = weather_cancelled.groupby(by = ['FL_DATE','DEST']).agg(
canceled_flights_arr=('CANCELLED', lambda x: (x == 1.0).sum()),
non_canceled_flights_arr=('CANCELLED', lambda x: (x == 0.0).sum())
).reset_index()
arr_cancel_sum['total_flights_arr'] = arr_cancel_sum['non_canceled_flights_arr'] + arr_cancel_sum['canceled_flights_arr']
arr_cancel_sum['cancelation_ratio_arr'] = arr_cancel_sum['canceled_flights_arr'] / arr_cancel_sum['total_flights_arr']
dep_cancel_sum = weather_cancelled.groupby(by =['FL_DATE','ORIGIN']).agg(
canceled_flights_dep=('CANCELLED', lambda x: (x == 1.0).sum()),
non_canceled_flights_dep=('CANCELLED', lambda x: (x == 0.0).sum())
).reset_index()
dep_cancel_sum['total_flights_dep'] = dep_cancel_sum['non_canceled_flights_dep'] + dep_cancel_sum['canceled_flights_dep']
arr_cancel_sum['cancelation_ratio_dep'] = dep_cancel_sum['canceled_flights_dep'] / dep_cancel_sum['total_flights_dep']
weather_cancelled = pd.merge(arr_cancel_sum, dep_cancel_sum, left_on=['FL_DATE', 'DEST'] , right_on=['FL_DATE','ORIGIN'], how='outer')
weather_cancelled['total_canceled_flights'] = weather_cancelled['canceled_flights_arr'].fillna(0) + weather_cancelled['canceled_flights_dep'].fillna(0)
weather_cancelled['total_non_canceled_flights'] = weather_cancelled['non_canceled_flights_arr'].fillna(0) + weather_cancelled['non_canceled_flights_dep'].fillna(0)
weather_cancelled['total_flights'] = weather_cancelled['total_flights_arr'].fillna(0) + weather_cancelled['total_flights_dep'].fillna(0)
weather_cancelled['total_cancelation_ratio'] = weather_cancelled['total_canceled_flights'].fillna(0) / weather_cancelled['total_flights'].fillna(0)
weather_cancelled.drop(columns=['ORIGIN', 'canceled_flights_arr', 'non_canceled_flights_arr', 'total_flights_arr', 'canceled_flights_dep',
'non_canceled_flights_dep', 'total_flights_dep'], inplace=True)
weather_cancelled.rename(columns={'DEST': 'airport_code', 'FL_DATE': 'date'}, inplace=True)
In [25]:
weather_cancelled
Out[25]:
| date | airport_code | cancelation_ratio_arr | cancelation_ratio_dep | total_canceled_flights | total_non_canceled_flights | total_flights | total_cancelation_ratio | |
|---|---|---|---|---|---|---|---|---|
| 0 | 2014-07-01 | ATL | 0.000000 | 0.000000 | 0 | 517 | 517 | 0.000000 |
| 1 | 2014-07-01 | CLT | 0.006993 | 0.000000 | 1 | 283 | 284 | 0.003521 |
| 2 | 2014-07-01 | DAL | 0.043478 | 0.065217 | 5 | 87 | 92 | 0.054348 |
| 3 | 2014-07-01 | DEN | 0.004717 | 0.004673 | 2 | 424 | 426 | 0.004695 |
| 4 | 2014-07-01 | DFW | 0.000000 | 0.000000 | 0 | 470 | 470 | 0.000000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 87667 | 2024-06-30 | SJC | 0.000000 | 0.000000 | 0 | 173 | 173 | 0.000000 |
| 87668 | 2024-06-30 | SJU | 0.044118 | 0.014493 | 4 | 133 | 137 | 0.029197 |
| 87669 | 2024-06-30 | SMF | 0.010638 | 0.010638 | 2 | 186 | 188 | 0.010638 |
| 87670 | 2024-06-30 | SNA | 0.000000 | 0.021053 | 2 | 190 | 192 | 0.010417 |
| 87671 | 2024-06-30 | STL | 0.021739 | 0.010753 | 3 | 182 | 185 | 0.016216 |
87672 rows × 8 columns
In [26]:
# merging flight data with weather data and formatting date
airport_hist_weather_daily_data_cancelled = pd.merge(left=airport_hist_weather_data_daily, right=weather_cancelled, how='inner', on = ['date', 'airport_code'])
airport_hist_weather_daily_data_cancelled['Year'] = airport_hist_weather_daily_data_cancelled['date'].dt.year
airport_hist_weather_daily_data_cancelled['Month'] = airport_hist_weather_daily_data_cancelled['date'].dt.month
airport_hist_weather_daily_data_cancelled['Day'] = airport_hist_weather_daily_data_cancelled['date'].dt.day
# rearranging columns
airport_hist_weather_daily_data_cancelled = airport_hist_weather_daily_data_cancelled[['Month', 'Day','weather_code', 'rain_sum', 'snowfall_sum',
'wind_speed_10m_max', 'wind_gusts_10m_max','total_cancelation_ratio']]
In [31]:
airport_hist_weather_daily_data_cancelled
Out[31]:
| Month | Day | airport_code | weather_code | rain_sum | snowfall_sum | wind_speed_10m_max | wind_gusts_10m_max | total_cancelation_ratio | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 7 | 1 | ATL | 3.0 | 0.000000 | 0.0 | 8.217153 | 24.480000 | 0.000000 |
| 1 | 7 | 2 | ATL | 53.0 | 0.500000 | 0.0 | 9.107359 | 25.199999 | 0.013133 |
| 2 | 7 | 3 | ATL | 53.0 | 0.500000 | 0.0 | 20.421871 | 44.279999 | 0.034991 |
| 3 | 7 | 4 | ATL | 3.0 | 0.000000 | 0.0 | 16.179987 | 40.680000 | 0.002525 |
| 4 | 7 | 5 | ATL | 2.0 | 0.000000 | 0.0 | 18.003599 | 38.880001 | 0.002309 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 87667 | 6 | 26 | STL | 63.0 | 28.400000 | 0.0 | 18.003599 | 39.239998 | 0.000000 |
| 87668 | 6 | 27 | STL | 0.0 | 0.000000 | 0.0 | 9.693295 | 31.319998 | 0.000000 |
| 87669 | 6 | 28 | STL | 3.0 | 0.000000 | 0.0 | 20.721119 | 39.599998 | 0.000000 |
| 87670 | 6 | 29 | STL | 63.0 | 24.300001 | 0.0 | 25.364037 | 53.279999 | 0.000000 |
| 87671 | 6 | 30 | STL | 53.0 | 2.600000 | 0.0 | 15.745627 | 33.839996 | 0.016216 |
87672 rows × 9 columns
In [32]:
save_dest = r"C:\Users\youss\Documents\University of Calgary\DATA 601"
save_variable = os.path.join(save_dest, 'cancelation_ratio_dataset.csv')
# airport_hist_weather_daily_data_cancelled.to_csv(save_variable)
In [306]:
cols_for_corr = ['rain_sum', 'snowfall_sum', 'wind_speed_10m_max', 'wind_gusts_10m_max', 'cancelation_ratio_arr', 'cancelation_ratio_dep', 'total_cancelation_ratio']
airport_cancelled_corr = airport_hist_weather_daily_data_cancelled[cols_for_corr].corr()
plt.figure(figsize=(10, 5))
sns.heatmap(airport_cancelled_corr, annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix: Weather Conditions and Cancellation')
plt.show()
In [263]:
airport_cancelled_corr
Out[263]:
| rain_sum | snowfall_sum | wind_speed_10m_max | wind_gusts_10m_max | cancelation_ratio_arr | cancelation_ratio_dep | total_cancelation_ratio | |
|---|---|---|---|---|---|---|---|
| rain_sum | 1.000000 | 0.005932 | 0.194295 | 0.256995 | 0.108564 | 0.111349 | 0.110789 |
| snowfall_sum | 0.005932 | 1.000000 | 0.066364 | 0.060876 | 0.210727 | 0.234667 | 0.224428 |
| wind_speed_10m_max | 0.194295 | 0.066364 | 1.000000 | 0.907091 | 0.113006 | 0.115158 | 0.114945 |
| wind_gusts_10m_max | 0.256995 | 0.060876 | 0.907091 | 1.000000 | 0.120034 | 0.123608 | 0.122749 |
| cancelation_ratio_arr | 0.108564 | 0.210727 | 0.113006 | 0.120034 | 1.000000 | 0.970394 | 0.992508 |
| cancelation_ratio_dep | 0.111349 | 0.234667 | 0.115158 | 0.123608 | 0.970394 | 1.000000 | 0.992632 |
| total_cancelation_ratio | 0.110789 | 0.224428 | 0.114945 | 0.122749 | 0.992508 | 0.992632 | 1.000000 |
In [318]:
import matplotlib.pyplot as plt
winter_months = [10,11,12,1,2]
fig, axes = plt.subplots(2, 3, figsize=(18, 10))
axes[0, 0].scatter(airport_hist_weather_daily_data_cancelled['rain_sum'], airport_hist_weather_daily_data_cancelled['total_cancelation_ratio'], color='blue', alpha=0.5)
axes[0, 0].set_title('Rain vs Total Cancelation Ratio')
axes[0, 0].set_xlabel('Rain (mm)')
axes[0, 0].set_ylabel('Total Cancelation Ratio')
axes[0, 0].set_xlim(0, 100)
axes[0, 1].scatter(airport_hist_weather_daily_data_cancelled[airport_hist_weather_daily_data_cancelled['Month'].isin(winter_months)]['snowfall_sum'], airport_hist_weather_daily_data_cancelled[airport_hist_weather_daily_data_cancelled['Month'].isin(winter_months)]['total_cancelation_ratio'], color='red', alpha=0.5)
axes[0, 1].set_title('Snowfall vs Total Cancelation Ratio')
axes[0, 1].set_xlabel('Snowfall (inches)')
axes[0, 1].set_ylabel('Total Cancelation Ratio')
axes[0, 2].scatter(airport_hist_weather_daily_data_cancelled['wind_speed_10m_max'], airport_hist_weather_daily_data_cancelled['total_cancelation_ratio'], color='green', alpha=0.5)
axes[0, 2].set_title('Wind Speed vs Total Cancelation Ratio')
axes[0, 2].set_xlabel('Wind Speed (m/s)')
axes[0, 2].set_ylabel('Cancelation Ratio')
axes[0, 0].set_xlim(0, 85)
axes[1, 1].scatter(airport_hist_weather_daily_data_cancelled['wind_gusts_10m_max'], airport_hist_weather_daily_data_cancelled['total_cancelation_ratio'], color='orange', alpha=0.5)
axes[1, 1].set_title('Wind Gusts vs Total Cancelation Ratio')
axes[1, 1].set_xlabel('Wind Gusts (m/s)')
axes[1, 1].set_ylabel('Total Cancelation Ratio')
axes[0, 0].set_xlim(0, 150)
fig.delaxes(axes[1, 2])
plt.tight_layout()
plt.show()
In [308]:
cancel_bins = pd.cut(airport_hist_weather_daily_data_cancelled['snowfall_sum'], bins=5)
grouped_data = airport_hist_weather_daily_data_cancelled.groupby(['airport_code', cancel_bins])['total_cancelation_ratio'].mean().unstack()
grouped_data.columns = grouped_data.columns.astype(str)
plt.figure(figsize=(12, 6))
for airport in grouped_data.index:
plt.plot(grouped_data.columns.astype(str), grouped_data.loc[airport], marker='o', label=airport)
last_valid_index = grouped_data.loc[airport].last_valid_index()
if last_valid_index is not None:
plt.text(last_valid_index, grouped_data.loc[airport].loc[last_valid_index], airport,
fontsize=8, color=plt.gca().lines[-1].get_color(),
verticalalignment='center', horizontalalignment='left')
plt.title('Cancellation Ratio Based on Snowfall for Each Airport')
plt.xlabel('Snowfall (Group - Inches)')
plt.ylabel('Average Cancellation Ratio')
plt.xticks(rotation=45)
plt.subplots_adjust(right=0.85)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\410197625.py:3: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data = airport_hist_weather_daily_data_cancelled.groupby(['airport_code', cancel_bins])['total_cancelation_ratio'].mean().unstack()
In [271]:
airports = ['DEN', 'EWR', 'JFK', 'RDU', 'ORD']
filtered_data = airport_hist_weather_daily_data_cancelled[airport_hist_weather_daily_data_cancelled['airport_code'].isin(airports)]
weather_factors = ['snowfall_sum']
bins = 10
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
binned_data = pd.cut(airport_data[factor], bins=bins)
avg_delay = airport_data.groupby(binned_data)['total_cancelation_ratio'].mean()
plt.figure(figsize=(8, 6))
plt.plot(avg_delay.index.astype(str), avg_delay, marker='o')
plt.title(f'{factor.capitalize()} vs Total Cancellation Ratio at {airport}')
plt.xlabel(f'{factor.capitalize()} (Binned)')
plt.ylabel('Average Weather Delay (minutes)')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2081834017.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['total_cancelation_ratio'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2081834017.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['total_cancelation_ratio'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2081834017.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['total_cancelation_ratio'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2081834017.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['total_cancelation_ratio'].mean()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\2081834017.py:13: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. avg_delay = airport_data.groupby(binned_data)['total_cancelation_ratio'].mean()
In [272]:
airports = ['JFK', 'EWR', 'ORD','DEN', 'RDU']
filtered_data = airport_hist_weather_daily_data_cancelled[airport_hist_weather_daily_data_cancelled['airport_code'].isin(airports)]
weather_factors = ['snowfall_sum']
for airport in airports:
airport_data = filtered_data[filtered_data['airport_code'] == airport]
for factor in weather_factors:
plt.figure(figsize=(8, 6))
plt.scatter(airport_data[factor], airport_data['total_cancelation_ratio'], alpha=0.5)
plt.title(f'{factor.capitalize()} vs Total Cancellation Ratio at {airport}')
plt.xlabel(f'{factor.capitalize()}')
plt.ylabel('Cancellation Ratio')
plt.grid(True)
plt.tight_layout()
plt.show()
In [276]:
custom_bins_gust = [0, 30, 60, 90, 120, 150, 180, 210,240,270,300]
cancel_bins_gust = pd.cut(airport_hist_weather_daily_data_cancelled['wind_gusts_10m_max'], bins=custom_bins_gust)
grouped_data = airport_hist_weather_daily_data_cancelled.groupby(['airport_code', cancel_bins_gust])['cancelation_ratio_arr'].mean().unstack()
grouped_data.columns = grouped_data.columns.astype(str)
plt.figure(figsize=(12, 6))
for airport in grouped_data.index:
plt.plot(grouped_data.columns.astype(str), grouped_data.loc[airport], marker='o', label=airport)
last_valid_index = grouped_data.loc[airport].last_valid_index()
if last_valid_index is not None:
plt.text(last_valid_index, grouped_data.loc[airport].loc[last_valid_index], airport,
fontsize=8, color=plt.gca().lines[-1].get_color(),
verticalalignment='center', horizontalalignment='left')
plt.title('Cancellation Ratio Based on Max Wind Gust Daily for Each Airport (Custom Bins)')
plt.xlabel('Daily Wind Gust Max (Group)')
plt.ylabel('Average Cancellation Ratio')
plt.xticks(rotation=45)
plt.subplots_adjust(right=0.85)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\3528284890.py:6: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data = airport_hist_weather_daily_data_cancelled.groupby(['airport_code', cancel_bins_gust])['cancelation_ratio_arr'].mean().unstack()
In [278]:
custom_bins_wind = [0, 20, 40, 60, 80, 100, 120, 140]
cancel_bins_wind = pd.cut(airport_hist_weather_daily_data_cancelled['wind_speed_10m_max'], bins=custom_bins_wind)
grouped_data = airport_hist_weather_daily_data_cancelled.groupby(['airport_code', cancel_bins_wind])['cancelation_ratio_dep'].mean().unstack()
grouped_data.columns = grouped_data.columns.astype(str)
plt.figure(figsize=(12, 6))
for airport in grouped_data.index:
plt.plot(grouped_data.columns.astype(str), grouped_data.loc[airport], marker='o', label=airport)
last_valid_index = grouped_data.loc[airport].last_valid_index()
if last_valid_index is not None:
plt.text(last_valid_index, grouped_data.loc[airport].loc[last_valid_index], airport,
fontsize=8, color=plt.gca().lines[-1].get_color(),
verticalalignment='center', horizontalalignment='left')
plt.title('Cancellation Ratio Trends Based on Wind Speed for Each Airport (Custom Bins)')
plt.xlabel('Wind Speed (Group)')
plt.ylabel('Average Delay (minutes)')
plt.xticks(rotation=45)
plt.subplots_adjust(right=0.85)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\898835844.py:4: FutureWarning: The default of observed=False is deprecated and will be changed to True in a future version of pandas. Pass observed=False to retain current behavior or observed=True to adopt the future default and silence this warning. grouped_data = airport_hist_weather_daily_data_cancelled.groupby(['airport_code', cancel_bins_wind])['cancelation_ratio_dep'].mean().unstack()
In [314]:
import matplotlib.pyplot as plt
import seaborn as sns
airports = ['JFK', 'EWR', 'ORD', 'SEA', 'PDX', 'DEN']
results = {}
for airport in airports:
airport_data = airport_hist_weather_daily_data_cancelled[airport_hist_weather_daily_data_cancelled['airport_code'] == airport]
results[airport] = airport_data[['Month', 'snowfall_sum']]
plt.figure(figsize=(10, 6))
combined_data = []
for airport, data in results.items():
data['Airport'] = airport
combined_data.append(data)
combined_df = pd.concat(combined_data)
sns.boxplot(x='Airport', y='snowfall_sum', data=combined_df)
plt.yscale('log')
plt.title('Snowfall Sum Distribution Across Airports')
plt.ylabel('Snowfall Sum (in mm) - Log Scale')
plt.xlabel('Airport')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
C:\Users\youss\AppData\Local\Temp\ipykernel_24372\3295475754.py:16: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy data['Airport'] = airport
Linear Regression Attempt¶
In [47]:
import sklearn
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
In [39]:
# assigning X and Y variables
X = airport_hist_weather_daily_data_cancelled.drop(columns=['total_cancelation_ratio'])
y = airport_hist_weather_daily_data_cancelled['total_cancelation_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Scaling Values
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
In [42]:
# Using Polynomial features to get the second moment, We know the relationship is not linear
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)
In [43]:
ridge_model = Ridge(alpha=1.0)
ridge_model.fit(X_train_poly, y_train)
Out[43]:
Ridge()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Ridge()
In [44]:
y_pred_ridge = ridge_model.predict(X_test_poly)
In [45]:
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r_squared_ridge = ridge_model.score(X_test_poly, y_test)
print(f'Ridge MSE: {mse_ridge}')
print(f'Ridge R-Squared: {r_squared_ridge}')
Ridge MSE: 0.003492344120309727 Ridge R-Squared: 0.04941913369656237
In [48]:
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_ridge, alpha=0.5, color='b')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'k--', lw=3) # Ideal line
plt.xlabel('Actual Cancellation Ratio')
plt.ylabel('Predicted Cancellation Ratio')
plt.title('Actual vs Predicted Cancellation Ratio (Ridge Regression with Polynomial Features)')
plt.show()
In [49]:
plt.figure(figsize=(10, 6))
residuals = y_test - y_pred_ridge
plt.scatter(y_pred_ridge, residuals, alpha=0.5, color='r')
plt.axhline(y=0, color='k', linestyle='--', lw=3)
plt.xlabel('Predicted Cancellation Ratio')
plt.ylabel('Residuals')
plt.title('Residual Plot (Ridge Regression with Polynomial Features)')
plt.show()
In [50]:
coefficients_ridge = ridge_model.coef_
poly_feature_names = poly.get_feature_names_out(X.columns)
ridge_coefficients_df = pd.DataFrame({
'Feature': poly_feature_names,
'Coefficient': coefficients_ridge
})
ridge_coefficients_df_sorted = ridge_coefficients_df.reindex(
ridge_coefficients_df.Coefficient.abs().sort_values(ascending=False).index
)
ridge_coefficients_df_sorted.head()
Out[50]:
| Feature | Coefficient | |
|---|---|---|
| 4 | snowfall_sum | 0.025039 |
| 3 | rain_sum | -0.017247 |
| 21 | weather_code rain_sum | 0.014123 |
| 22 | weather_code snowfall_sum | -0.009677 |
| 2 | weather_code | 0.006926 |
Feature Importance¶
In [51]:
top_coefficients = ridge_coefficients_df_sorted.head(10)
plt.figure(figsize=(10, 6))
plt.barh(top_coefficients['Feature'], top_coefficients['Coefficient'], color='c')
plt.xlabel('Coefficient Value')
plt.ylabel('Feature')
plt.title('Top 10 Coefficients (Ridge Regression with Polynomial Features)')
plt.show()